# Configuration file for the Sphinx documentation builder.
#
# For the full list of built-in configuration values, see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

import os
import re
import sys
from data_juicer import __version__ as version


# Build parameters read from environment variables.
# Version label exposed to the HTML templates as "current_version"; empty when
# DOCS_VERSION is unset.
CURRENT_VERSION = os.environ.get("DOCS_VERSION", "")
# Git ref inserted into the GitHub URLs built by process_doc_links().
GIT_REF_FOR_LINKS = os.environ.get("GIT_REF_FOR_LINKS", "main")
# Comma-separated list of versions; empty entries are dropped.
AVAILABLE_VERSIONS = [v for v in os.environ.get("AVAILABLE_VERSIONS", "").split(",") if v]
# Optional paths; both are None when the variable is unset. CODE_ROOT is
# consulted in the path setup below.
REPO_ROOT = os.environ.get("REPO_ROOT")
CODE_ROOT = os.environ.get("CODE_ROOT")

# Sphinx "release" value, taken from the data_juicer package version.
release = version

# -- Path setup --------------------------------------------------------------
# Directory that contains this conf.py.
current_dir = os.path.dirname(__file__)
# Put the project sources on sys.path: CODE_ROOT when it names an existing
# directory, otherwise "../../" resolved against the current working directory.
if CODE_ROOT and os.path.isdir(CODE_ROOT):
    sys.path.insert(0, os.path.abspath(CODE_ROOT))
else:
    sys.path.insert(0, os.path.abspath("../../"))
# Inserted last at index 0, so this directory is searched before the code root.
sys.path.insert(0, current_dir)

# Must stay below the sys.path changes above.
# NOTE(review): custom_myst presumably lives next to this file -- confirm.
from custom_myst import ReplaceVideoLinksTransform

# -- Project information -----------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
project = "Data-Juicer"
copyright = "2024, Data-Juicer Team"
author = "Data-Juicer Team"

# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
# Map each source file extension to the parser used for it.
source_suffix = {".rst": "restructuredtext", ".md": "markdown"}

# Extensions to load: built-in sphinx.ext modules for the API reference, plus
# myst_parser for the ".md" sources and sphinx_copybutton.
extensions = [
    "sphinx.ext.autodoc",
    "sphinx.ext.autosummary",
    "sphinx.ext.viewcode",
    "sphinx.ext.napoleon",
    "sphinx.ext.autosectionlabel",
    "myst_parser",
    "sphinx_copybutton",
]

# -- Extension configuration ------------------------------------------------
# MyST-Parser settings for the Markdown sources.
# NOTE(review): per the MyST docs this is the heading depth that receives
# auto-generated anchors -- confirm against the installed myst_parser version.
myst_heading_anchors = 4
myst_enable_extensions = [
    "linkify",
    "tasklist",
]

# Prefix document path to section labels, otherwise autogenerated labels would
# look like 'heading' rather than 'path/to/file:heading'
autosectionlabel_prefix_document = True
# autosummary / autodoc settings for the generated API pages.
autosummary_generate = True
autosummary_ignore_module_all = False
autodoc_member_order = "bysource"

# -- Templates and patterns -------------------------------------------------
templates_path = ["_templates"]
# Initial exclusions only: find_zh_exclusions() appends language-specific
# entries to this list when the "config-inited" event fires.
exclude_patterns = ["build", "demos/process_video_on_ray/data/Note.md"]

# -- Options for HTML output ------------------------------------------------
# The theme to use for HTML and HTML Help pages.
# See the documentation for a list of builtin themes.
html_theme = "furo"
html_title = "Data Juicer"

# Favicon and logo configuration
html_favicon = "_static/images/icon.png"
html_logo = "_static/images/logo.png"

# Sidebar configuration: the "**" key applies this template list to every page.
html_sidebars = {
    "**": [
        "sidebar/brand.html",
        "sidebar/search.html",
        "sidebar/scroll-start.html",
        "sidebar/navigation.html",
        "sidebar/scroll-end.html",
        "sidebar/bottom_menu.html",
    ],
}
# Extra stylesheet to include in the HTML pages.
# NOTE(review): presumably resolved inside html_static_path -- confirm the
# file exists there.
html_css_files = ["sidebar-menu.css"]
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ["_static"]
# Additional paths whose contents are copied into the HTML output.
html_extra_path = ['extra']

# -- Internationalization settings ------------------------------------------
# Default build language; find_zh_exclusions() and setup() switch behaviour
# when it is "zh_CN".
language = "en"
locale_dirs = ["locale/"]  # directories searched for translation catalogs
gettext_compact = False  # one message catalog per source document

# Supported languages, keyed by language code with the display name as value;
# passed to the HTML templates through html_context.
supported_languages = {
    "en": "English",
    "zh_CN": "简体中文",
    # 'ja': '日本語',
}


def get_lang_link(language, pagename, lang_code, non_zh_pages=None, current_version=""):
    """Build the relative URL of the current page in another language build.

    Chinese pages carry a ``_ZH`` suffix on the page name. When linking from a
    Chinese build to a non-Chinese one the suffix is stripped; when linking to
    a Chinese build the suffix is added unless the page is listed in
    ``non_zh_pages``.

    Args:
        language: Language of the build being rendered (e.g. ``"zh_CN"``).
        pagename: Name of the current page, without file extension.
        lang_code: Language code of the build being linked to.
        non_zh_pages: Iterable of page names that have no ``_ZH`` counterpart.
            ``None`` (the default) or any empty value means "no such pages".
        current_version: Version path segment. When empty it is left out of
            the result instead of producing an empty ``//`` segment.

    Returns:
        ``"<lang_code>/<current_version>/<page>.html"``, or
        ``"<lang_code>/<page>.html"`` when ``current_version`` is empty.
    """
    # Normalise both sides so "a/./b" style names still compare equal.
    pages_without_zh = {os.path.normpath(p) for p in (non_zh_pages or ())}
    target_page = pagename

    if "CN" in language and pagename.endswith("_ZH") and "CN" not in lang_code:
        # Leaving a Chinese page for another language: drop the "_ZH" suffix.
        target_page = pagename[:-3]
    if "CN" in lang_code and not pagename.endswith("_ZH"):
        if os.path.normpath(pagename) not in pages_without_zh:
            target_page += "_ZH"

    prefix = f"{lang_code}/{current_version}" if current_version else lang_code
    return f"{prefix}/{target_page}.html"


# Values made available to the HTML templates. find_zh_exclusions() later adds
# a "non_zh_pages" entry to this dict.
html_context = {
    "supported_languages": supported_languages,
    "get_lang_link": get_lang_link,
    "current_version": CURRENT_VERSION,
    "available_versions": AVAILABLE_VERSIONS,
}


# -- setup configuration ------------------------------------------------
def find_zh_exclusions(app, config):
    """
    Work out which source files each language build has to exclude.

    Walks ``app.srcdir`` and looks at every file whose name does not end in
    ``_ZH.md`` / ``_ZH.rst``:

    * if a sibling ``<name>_ZH<ext>`` exists, the file's path relative to the
      source directory is recorded as an exclusion for the Chinese build;
    * otherwise its extension-less relative path is recorded as a page that
      has no Chinese counterpart.

    For ``config.language == "zh_CN"`` the recorded exclusions are appended to
    ``config.exclude_patterns``; for any other language the ``_ZH`` glob
    patterns are appended instead. The pages without a Chinese counterpart are
    merged into ``app.config.html_context["non_zh_pages"]``.
    """
    non_zh_pages = set()
    zh_exclusions = []

    for root, _dirs, files in os.walk(app.srcdir):
        for file in files:
            if file.endswith(("_ZH.md", "_ZH.rst")):
                continue

            base_name, ext = os.path.splitext(file)
            zh_file_path = os.path.join(root, f"{base_name}_ZH{ext}")

            if os.path.exists(zh_file_path):
                rel_path = os.path.normpath(os.path.relpath(os.path.join(root, file), app.srcdir))
                # exclude_patterns use "/" on every platform, so convert the
                # native separators that normpath produces on Windows.
                zh_exclusions.append(rel_path.replace(os.sep, "/"))
            else:
                # Kept in os.path.normpath form: get_lang_link() normalises
                # page names the same way before comparing.
                non_zh_pages.add(os.path.normpath(os.path.relpath(os.path.join(root, base_name), app.srcdir)))

    if config.language == "zh_CN":
        config.exclude_patterns.extend(zh_exclusions)
    else:
        config.exclude_patterns.extend(["*_ZH*", "**/*_ZH*"])

    app.config.html_context.setdefault("non_zh_pages", set()).update(non_zh_pages)


def rebuild_source_dir(app, config):
    """Handler for "config-inited": apply the per-language file exclusions."""
    # All of the work is delegated to find_zh_exclusions().
    find_zh_exclusions(app=app, config=config)


def skip(app, what, name, obj, would_skip, options):
    """Decide whether a member is left out of the generated documentation.

    ``__init__`` is always kept (``False`` is returned for it); for every other
    member the incoming ``would_skip`` decision is returned unchanged.
    """
    return False if name == "__init__" else would_skip


def process_doc_links(app, docname, source):
    """Rewrite relative non-document Markdown links into GitHub URLs.

    Every ``[text](target)`` link whose target does not start with ``http``
    or ``#`` and does not end in ``.md`` / ``.rst`` is resolved relative to the
    directory of ``docname`` and pointed at the repository on GitHub, at the
    ref given by ``GIT_REF_FOR_LINKS``.

    Args:
        app: Sphinx application (unused).
        docname: Name of the document being read; it supplies the base
            directory for resolving relative targets.
        source: Single-element list holding the document text; ``source[0]``
            is replaced in place.

    Returns:
        The rewritten document text (the new ``source[0]``).
    """
    # Docnames and URLs are "/"-separated on every platform; os.path would
    # put backslashes into the generated URLs on Windows.
    import posixpath

    repo_base = f"https://github.com/datajuicer/data-juicer/blob/{GIT_REF_FOR_LINKS}/"
    doc_dir = posixpath.dirname(docname)

    def link_replacer(match):
        text, path = match.group(1), match.group(2)
        abs_path = posixpath.normpath(posixpath.join(doc_dir, path))
        return f"[{text}]({repo_base}{abs_path})"

    # group 1: link text; group 2: target. The lookahead skips http(s) and
    # in-page anchors; the lookbehinds skip links to other .md/.rst documents.
    pattern = r"\[([^\]]+)\]\((?!http|#)([^)]*(?<!\.md)(?<!\.rst))\)"
    source[0] = re.sub(pattern, link_replacer, source[0])
    return source[0]


def process_tutorial(app, docname, source):
    """Strip navigation boilerplate from a document while it is being read.

    Two things are removed from ``source[0]``:

    * the "overview" bullet that links back to the README of the language
      being built (``app.config.language``);
    * language-switcher lines, i.e. a line starting with "en..." followed by
      ``|`` and a ``.md`` link whose text starts with the two characters for
      "Chinese", or the mirrored form (matched case-insensitively).

    Returns the cleaned text, which is also stored back into ``source[0]``.
    """
    overview_line = (
        "- [DJ概览](../../README_ZH.md)"
        if app.config.language == "zh_CN"
        else "- [Overview of DJ](../../README.md)"
    )
    # Two alternatives: "en... | [<Chinese>...](x.md)" and the reverse order.
    switcher_pattern = (
        r"(?i)\nen[A-Za-z\s]{0,12}\|\s*\[\u4e2d\u6587[\u4e00-\u9fa5\s]{0,12}\]\([^)]+\.md\)"
        r"|\n\u4e2d\u6587[\u4e00-\u9fa5\s]{0,12}\|\s*\[en[A-Za-z\s]{0,12}\]\([^)]+\.md\)"
    )

    without_overview = source[0].replace(overview_line, "")
    source[0] = re.sub(switcher_pattern, "", without_overview)
    return source[0]


def process_read(app, docname, source):
    """Handler for "source-read": run the text clean-up steps in order.

    The tutorial clean-up runs first and the link rewriting second; each step
    receives the same ``source`` list and its result is stored in ``source[0]``.
    """
    for step in (process_tutorial, process_doc_links):
        source[0] = step(app, docname, source)


def setup(app):
    """Register this project's transform and event handlers with Sphinx."""
    app.add_transform(ReplaceVideoLinksTransform)
    app.connect("config-inited", rebuild_source_dir)

    # Chinese builds start from index_ZH; every other language from index.
    if app.config.language == "zh_CN":
        app.config.root_doc = "index_ZH"
    else:
        app.config.root_doc = "index"

    for event, handler in (
        ("source-read", process_read),
        ("autodoc-skip-member", skip),
    ):
        app.connect(event, handler)
